1. Data Understanding & Preparation¶
1.1 Sources & Scope¶
- 4 HDB resale CSVs (2000–2024), ~628k rows, 26 towns, 7 flat types.
In [7]:
from pathlib import Path
import pandas as pd
data_dir = Path(".") # CSVs live alongside the notebook
files = [
"resale-flat-prices-approval-2000-2005.csv",
"resale-flat-prices-approval-2006-2012.csv",
"resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv",
"resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv",
"ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv",
]
# (Keep the rest of your cleaning + concat loop the same, using pd.read_csv(data_dir / f))
In [6]:
data_dir = Path(".")
files = sorted(p.name for p in data_dir.glob("*.csv"))
print(files) # sanity check
df_list = []
expected_cols = [
"month","town","flat_type","block","street_name","storey_range",
"floor_area_sqm","flat_model","lease_commence_date","resale_price","remaining_lease",
]
for f in files:
pdf = pd.read_csv(data_dir / f, encoding="utf-8", low_memory=False)
pdf.columns = pdf.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
for col in expected_cols:
if col not in pdf.columns:
pdf[col] = pd.NA
pdf["month"] = pd.to_datetime(pdf["month"], errors="coerce")
pdf["resale_price"] = pd.to_numeric(pdf["resale_price"], errors="coerce")
pdf["floor_area_sqm"] = pd.to_numeric(pdf["floor_area_sqm"], errors="coerce")
pdf["lease_commence_date"] = pd.to_numeric(pdf["lease_commence_date"], errors="coerce")
pdf["source_file"] = f
df_list.append(pdf)
df = pd.concat(df_list, ignore_index=True)
print("✅ Combined data shape:", df.shape)
df.head()
['ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv', 'resale-flat-prices-approval-2000-2005.csv', 'resale-flat-prices-approval-2006-2012.csv', 'resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv', 'resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv'] ✅ Combined data shape: (628591, 12)
Out[6]:
| month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | lease_commence_date | remaining_lease | resale_price | source_file | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017-01-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 10 TO 12 | 44.0 | Improved | 1979 | 61 years 04 months | 232000.0 | ResaleflatpricesbasedonregistrationdatefromJan... |
| 1 | 2017-01-01 | ANG MO KIO | 3 ROOM | 108 | ANG MO KIO AVE 4 | 01 TO 03 | 67.0 | New Generation | 1978 | 60 years 07 months | 250000.0 | ResaleflatpricesbasedonregistrationdatefromJan... |
| 2 | 2017-01-01 | ANG MO KIO | 3 ROOM | 602 | ANG MO KIO AVE 5 | 01 TO 03 | 67.0 | New Generation | 1980 | 62 years 05 months | 262000.0 | ResaleflatpricesbasedonregistrationdatefromJan... |
| 3 | 2017-01-01 | ANG MO KIO | 3 ROOM | 465 | ANG MO KIO AVE 10 | 04 TO 06 | 68.0 | New Generation | 1980 | 62 years 01 month | 265000.0 | ResaleflatpricesbasedonregistrationdatefromJan... |
| 4 | 2017-01-01 | ANG MO KIO | 3 ROOM | 601 | ANG MO KIO AVE 5 | 01 TO 03 | 67.0 | New Generation | 1980 | 62 years 05 months | 265000.0 | ResaleflatpricesbasedonregistrationdatefromJan... |
In [8]:
# Basic health checks
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 628591 entries, 0 to 628590 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 month 628591 non-null datetime64[ns] 1 town 628591 non-null object 2 flat_type 628591 non-null object 3 block 628591 non-null object 4 street_name 628591 non-null object 5 storey_range 628591 non-null object 6 floor_area_sqm 628591 non-null float64 7 flat_model 628591 non-null object 8 lease_commence_date 628591 non-null int64 9 remaining_lease 206737 non-null object 10 resale_price 628591 non-null float64 11 source_file 628591 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(8) memory usage: 57.5+ MB
1.2 Cleaning & Standardisation¶
- Standardised columns; parsed
month, coerced numerics. - Engineered
flat_age_years,remaining_lease_years.
In [3]:
import numpy as np
# 1️⃣ Convert "month" column to datetime format
df["month"] = pd.to_datetime(df["month"], errors="coerce")
# 2️⃣ Convert numeric columns
for c in ["floor_area_sqm", "lease_commence_date", "resale_price"]:
df[c] = pd.to_numeric(df[c], errors="coerce")
# 3️⃣ Create a real datetime for lease_commence_date (1 Jan of that year)
df["lease_start_date"] = pd.to_datetime(df["lease_commence_date"], format="%Y", errors="coerce")
# 4️⃣ Calculate flat age (years between sale and lease start)
df["flat_age_years"] = np.where(
df["lease_start_date"].notna() & df["month"].notna(),
df["month"].dt.year - df["lease_start_date"].dt.year,
np.nan
)
# 5️⃣ Compute remaining lease = 99 – age and clip between 0 and 99
df["remaining_lease_years"] = (99 - df["flat_age_years"]).clip(lower=0, upper=99)
# 6️⃣ Preview cleaned columns
df_clean = df[[
"month","town","flat_type","storey_range","floor_area_sqm",
"flat_model","lease_commence_date","flat_age_years","remaining_lease_years","resale_price"
]]
print("✅ After cleaning:", df_clean.shape)
df_clean.head(10)
✅ After cleaning: (628591, 10)
Out[3]:
| month | town | flat_type | storey_range | floor_area_sqm | flat_model | lease_commence_date | flat_age_years | remaining_lease_years | resale_price | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2000-01-01 | ANG MO KIO | 3 ROOM | 07 TO 09 | 69.0 | Improved | 1986 | 14.0 | 85.0 | 147000.0 |
| 1 | 2000-01-01 | ANG MO KIO | 3 ROOM | 04 TO 06 | 61.0 | Improved | 1986 | 14.0 | 85.0 | 144000.0 |
| 2 | 2000-01-01 | ANG MO KIO | 3 ROOM | 07 TO 09 | 73.0 | New Generation | 1976 | 24.0 | 75.0 | 159000.0 |
| 3 | 2000-01-01 | ANG MO KIO | 3 ROOM | 07 TO 09 | 73.0 | New Generation | 1976 | 24.0 | 75.0 | 167000.0 |
| 4 | 2000-01-01 | ANG MO KIO | 3 ROOM | 07 TO 09 | 67.0 | New Generation | 1976 | 24.0 | 75.0 | 163000.0 |
| 5 | 2000-01-01 | ANG MO KIO | 3 ROOM | 04 TO 06 | 73.0 | New Generation | 1977 | 23.0 | 76.0 | 157000.0 |
| 6 | 2000-01-01 | ANG MO KIO | 3 ROOM | 07 TO 09 | 73.0 | New Generation | 1977 | 23.0 | 76.0 | 178000.0 |
| 7 | 2000-01-01 | ANG MO KIO | 3 ROOM | 07 TO 09 | 68.0 | New Generation | 1981 | 19.0 | 80.0 | 160000.0 |
| 8 | 2000-01-01 | ANG MO KIO | 3 ROOM | 04 TO 06 | 68.0 | New Generation | 1981 | 19.0 | 80.0 | 169000.0 |
| 9 | 2000-01-01 | ANG MO KIO | 3 ROOM | 07 TO 09 | 82.0 | New Generation | 1981 | 19.0 | 80.0 | 205000.0 |
1.3 Dataset Summary (Key Variables)¶
In [4]:
# Convert all text columns to uppercase and remove extra spaces
text_columns = ["town", "flat_type", "flat_model", "storey_range", "street_name"]
for col in text_columns:
if col in df.columns:
df[col] = (
df[col]
.astype(str) # ensure text
.str.upper() # make consistent casing
.str.strip() # remove spaces
)
# Check unique values after standardization
print("✅ Towns:", df["town"].nunique())
print("✅ Flat types:", df["flat_type"].unique()[:10])
print("✅ Flat models:", df["flat_model"].unique()[:10])
✅ Towns: 26 ✅ Flat types: ['3 ROOM' '4 ROOM' '5 ROOM' 'EXECUTIVE' '2 ROOM' 'MULTI-GENERATION' '1 ROOM'] ✅ Flat models: ['IMPROVED' 'NEW GENERATION' 'MODEL A' 'STANDARD' 'APARTMENT' 'SIMPLIFIED' 'MODEL A-MAISONETTE' 'MAISONETTE' 'MULTI GENERATION' 'ADJOINED FLAT']
In [5]:
# Check how many missing values remain in each column
df.isna().sum().sort_values(ascending=False)
Out[5]:
remaining_lease 421854 town 0 month 0 block 0 street_name 0 storey_range 0 flat_type 0 floor_area_sqm 0 flat_model 0 lease_commence_date 0 resale_price 0 source_file 0 lease_start_date 0 flat_age_years 0 remaining_lease_years 0 dtype: int64
In [6]:
# Drop rows with missing resale price (target variable)
df = df.dropna(subset=["resale_price"])
# Fill missing flat_age or remaining_lease_years with median
df["flat_age_years"] = df["flat_age_years"].fillna(df["flat_age_years"].median())
df["remaining_lease_years"] = df["remaining_lease_years"].fillna(df["remaining_lease_years"].median())
# Quick check again
print("✅ Missing values after cleaning:")
print(df.isna().sum())
✅ Missing values after cleaning: month 0 town 0 flat_type 0 block 0 street_name 0 storey_range 0 floor_area_sqm 0 flat_model 0 lease_commence_date 0 resale_price 0 remaining_lease 421854 source_file 0 lease_start_date 0 flat_age_years 0 remaining_lease_years 0 dtype: int64
In [11]:
# Date + numeric sanity
df['month'] = pd.to_datetime(df['month'], errors='coerce')
df[['resale_price','floor_area_sqm']].describe()
Out[11]:
| resale_price | floor_area_sqm | |
|---|---|---|
| count | 6.285910e+05 | 628591.000000 |
| mean | 3.620510e+05 | 96.767132 |
| std | 1.643300e+05 | 25.051891 |
| min | 2.800000e+04 | 28.000000 |
| 25% | 2.400000e+05 | 74.000000 |
| 50% | 3.380000e+05 | 96.000000 |
| 75% | 4.500000e+05 | 113.000000 |
| max | 1.500000e+06 | 297.000000 |
In [12]:
# Categorical distribution + time coverage
df['town'].value_counts().head(10), df['flat_type'].value_counts().head(10)
Out[12]:
(town WOODLANDS 55028 TAMPINES 48480 JURONG WEST 46656 YISHUN 41503 BEDOK 38939 HOUGANG 34173 SENGKANG 31532 ANG MO KIO 31192 CHOA CHU KANG 29136 BUKIT BATOK 27333 Name: count, dtype: int64, flat_type 4 ROOM 248960 3 ROOM 180081 5 ROOM 144616 EXECUTIVE 46835 2 ROOM 7366 1 ROOM 473 MULTI-GENERATION 260 Name: count, dtype: int64)
In [13]:
# Time span
df['month'].min(), df['month'].max(), len(df)
Out[13]:
(Timestamp('2000-01-01 00:00:00'), Timestamp('2024-01-01 00:00:00'), 628591)
1.3.1 Count by Flat Type¶
In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(8,5))
ax = sns.countplot(
x="flat_type",
hue="flat_type", # ✅ fix for seaborn 0.14+ (palette requires hue)
data=df,
order=df["flat_type"].value_counts().index,
palette="muted",
dodge=False,
legend=False
)
ax.set_title("Distribution of Records by Flat Type")
ax.set_xlabel("Flat Type")
ax.set_ylabel("Number of Transactions")
plt.xticks(rotation=0)
# ✅ annotate counts on bars
for p in ax.patches:
h = p.get_height()
ax.annotate(
f"{h:,}",
(p.get_x() + p.get_width()/2, h),
ha="center", va="bottom", fontsize=9,
xytext=(0, 3), textcoords="offset points"
)
plt.tight_layout()
plt.show()
1.3.2 Resale Price Distribution¶
In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# helper: format axes with thousands separators
fmt = FuncFormatter(lambda x, _: f"{int(x):,}")
plt.figure(figsize=(8,4))
sns.histplot(
df["resale_price"],
bins=50,
kde=True,
color=sns.color_palette("muted")[0] # soft color, no hue needed
)
plt.title("Distribution of Resale Prices")
plt.xlabel("Resale Price (SGD)")
plt.ylabel("Count")
plt.gca().xaxis.set_major_formatter(fmt)
plt.tight_layout()
plt.show()
1.3.3 Records by Town (Top 10)¶
In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
# Prepare top-10 towns by transaction count
top10_towns = df["town"].value_counts().head(10).index
df_top10 = df[df["town"].isin(top10_towns)]
order = df_top10["town"].value_counts().index # sorted order
plt.figure(figsize=(9,5))
ax = sns.countplot(
x="town",
hue="town", # ✅ fixes Seaborn 0.14 palette warning
data=df_top10,
order=order,
palette="muted",
dodge=False,
legend=False
)
ax.set_title("Top 10 Towns by Number of Transactions")
ax.set_xlabel("Town")
ax.set_ylabel("Number of Transactions")
plt.xticks(rotation=15)
# annotate counts on bars
for p in ax.patches:
h = p.get_height()
ax.annotate(
f"{h:,}",
(p.get_x() + p.get_width()/2, h),
ha="center", va="bottom", fontsize=9,
xytext=(0, 3), textcoords="offset points"
)
plt.tight_layout()
plt.show()
1.3.4 Line chart — price trend over time for selected towns¶
In [10]:
import pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
towns = ["ANG MO KIO","TAMPINES","JURONG WEST","WOODLANDS"] # edit as you like
df_line = df.copy()
df_line["year_month"] = pd.to_datetime(df_line["month"]).dt.to_period("M").dt.to_timestamp()
df_line = df_line[df_line["town"].isin(towns)]
trend = df_line.groupby(["year_month","town"], as_index=False)["resale_price"].mean()
plt.figure(figsize=(9,4))
sns.lineplot(data=trend, x="year_month", y="resale_price", hue="town", marker="o", palette="muted")
plt.title("Average Resale Price by Month — Selected Towns")
plt.xlabel("Month"); plt.ylabel("Avg Price (SGD)")
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x,_: f"{int(x):,}"))
plt.tight_layout(); plt.show()
1.3.5 Bar chart — average prices by town or flat type¶
By town (top 10):
In [11]:
import seaborn as sns, matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# Top 10 towns by average price
top10 = (df.groupby("town")["resale_price"]
.mean().sort_values(ascending=False)
.head(10).reset_index())
plt.figure(figsize=(9,5))
ax = sns.barplot(
data=top10, y="town", x="resale_price",
hue="town", dodge=False, legend=False, palette="muted"
)
plt.title("Average Resale Price — Top 10 Towns")
plt.xlabel("Avg Price (SGD)"); plt.ylabel("")
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))
# --- add data labels ---
for p in ax.patches:
w = p.get_width()
y = p.get_y() + p.get_height()/2
ax.text(w + 2000, y, f"{int(w):,}", va="center", ha="left", fontsize=10)
plt.tight_layout(); plt.show()
In [12]:
by_type = (df.groupby("flat_type")["resale_price"]
.mean().sort_values().reset_index())
plt.figure(figsize=(9,4))
ax = sns.barplot(
data=by_type, x="flat_type", y="resale_price",
hue="flat_type", dodge=False, legend=False, palette="pastel"
)
plt.title("Average Resale Price by Flat Type")
plt.xlabel(""); plt.ylabel("Avg Price (SGD)")
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))
# --- add data labels ---
for p in ax.patches:
h = p.get_height()
x = p.get_x() + p.get_width()/2
ax.text(x, h + 2500, f"{int(h):,}", ha="center", va="bottom", fontsize=10)
plt.tight_layout(); plt.show()
1.3.6 Map chart — geographic price distribution across Singapore¶
In [18]:
import plotly.express as px
town_coords = {
"ANG MO KIO":(1.3691,103.8454),"BEDOK":(1.3236,103.9305),"BISHAN":(1.3508,103.8485),
"BUKIT BATOK":(1.3502,103.7490),"BUKIT MERAH":(1.2857,103.8120),"BUKIT PANJANG":(1.3786,103.7616),
"BUKIT TIMAH":(1.3294,103.8021),"CENTRAL AREA":(1.2903,103.8519),"CHOA CHU KANG":(1.3854,103.7449),
"CLEMENTI":(1.3151,103.7643),"GEYLANG":(1.3180,103.8830),"HOUGANG":(1.3626,103.8922),
"JURONG EAST":(1.3331,103.7430),"JURONG WEST":(1.3403,103.7074),"KALLANG/WHAMPOA":(1.3136,103.8640),
"MARINE PARADE":(1.3030,103.9065),"PASIR RIS":(1.3731,103.9497),"PUNGGOL":(1.4053,103.9020),
"QUEENSTOWN":(1.2941,103.7866),"SEMBAWANG":(1.4491,103.8201),"SENGKANG":(1.3933,103.8959),
"SERANGOON":(1.3536,103.8720),"TAMPINES":(1.3531,103.9457),"TOA PAYOH":(1.3346,103.8530),
"WOODLANDS":(1.4360,103.7865),"YISHUN":(1.4304,103.8353)
}
df_map = (df.groupby("town", as_index=False)["resale_price"].mean()
.assign(lat=lambda d: d["town"].map(lambda t: town_coords.get(t,(None,None))[0]),
lon=lambda d: d["town"].map(lambda t: town_coords.get(t,(None,None))[1]))
.dropna(subset=["lat","lon"]))
fig = px.scatter_mapbox(df_map, lat="lat", lon="lon", size="resale_price",
color="resale_price", color_continuous_scale="Viridis",
hover_name="town", hover_data={"resale_price":":,.0f"},
size_max=30, zoom=10, height=500)
fig.update_layout(mapbox_style="open-street-map", title="Avg Resale Price by Town (Map)")
fig.show()
1.3.7 Scatter — floor area vs price, colored by town¶
In [13]:
sample = df.sample(6000, random_state=42) if len(df)>6000 else df
plt.figure(figsize=(9,4))
sns.scatterplot(data=sample, x="floor_area_sqm", y="resale_price",
hue="town", alpha=0.35, s=30, palette="tab20")
plt.title("Floor Area vs Resale Price (colored by Town)")
plt.xlabel("Floor Area (sqm)"); plt.ylabel("Resale Price (SGD)")
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x,_: f"{int(x):,}"))
plt.legend(ncol=2, fontsize=8, bbox_to_anchor=(1.02,1), loc="upper left")
plt.tight_layout(); plt.show()
1.3.8 Heatmap — price per sqm by town × flat type¶
In [14]:
df_ppsqm = df.assign(price_per_sqm = df["resale_price"] / df["floor_area_sqm"])
pivot = (df_ppsqm.groupby(["town","flat_type"])["price_per_sqm"]
.mean().unstack("flat_type"))
plt.figure(figsize=(12,8))
sns.heatmap(pivot, cmap="YlGnBu", annot=False)
plt.title("Average Price per sqm by Town and Flat Type")
plt.xlabel("Flat Type"); plt.ylabel("Town")
plt.tight_layout(); plt.show()
In [13]:
import pandas as pd
# Pick the key variables for Slide 7
summary_cols = ["floor_area_sqm", "flat_age_years", "remaining_lease_years", "resale_price"]
# Safety: keep only columns that exist (in case of naming differences)
available = [c for c in summary_cols if c in df.columns]
# Summary stats
summary = df[available].describe().T # transpose so variables are rows
summary = summary.rename(columns={
"50%": "median"
})[["count", "mean", "std", "min", "median", "max"]].round(2)
# Missing values per column
missing = df[available].isna().sum().rename("missing")
# Merge for one clean table
summary_table = summary.merge(missing, left_index=True, right_index=True)
summary_table
Out[13]:
| count | mean | std | min | median | max | missing | |
|---|---|---|---|---|---|---|---|
| floor_area_sqm | 628591.0 | 96.77 | 25.05 | 28.0 | 96.0 | 297.0 | 0 |
| flat_age_years | 628591.0 | 20.12 | 11.42 | -2.0 | 20.0 | 57.0 | 0 |
| remaining_lease_years | 628591.0 | 78.88 | 11.42 | 42.0 | 79.0 | 99.0 | 0 |
| resale_price | 628591.0 | 362050.97 | 164329.97 | 28000.0 | 338000.0 | 1500000.0 | 0 |
In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# Prepare yearly average price
df_year = df.copy()
df_year["year"] = pd.to_datetime(df_year["month"]).dt.year
yearly = df_year.groupby("year", as_index=False)["resale_price"].mean()
# Plot
plt.figure(figsize=(8,5))
sns.lineplot(data=yearly, x="year", y="resale_price", marker="o", color=sns.color_palette("muted")[0])
plt.title("Average Resale Price by Year")
plt.xlabel("Year")
plt.ylabel("Average Resale Price (SGD)")
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))
plt.tight_layout()
plt.show()
2.2 Floor Area vs Resale Price¶
In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
plt.figure(figsize=(8,4))
sns.scatterplot(
data=df.sample(5000, random_state=42) if len(df) > 5000 else df, # speed + clarity
x="floor_area_sqm",
y="resale_price",
alpha=0.3,
hue=None,
color=sns.color_palette("muted")[1]
)
plt.title("Floor Area vs Resale Price")
plt.xlabel("Floor Area (sqm)")
plt.ylabel("Resale Price (SGD)")
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))
plt.tight_layout()
plt.show()
2.3 Correlation Heatmap¶
In [17]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
num_cols = ["floor_area_sqm", "flat_age_years", "remaining_lease_years", "resale_price"]
num_cols = [c for c in num_cols if c in df.columns]
corr = df[num_cols].corr()
plt.figure(figsize=(6,4))
sns.heatmap(
corr, annot=True, fmt=".2f",
cmap="crest", # light, professional
vmin=-1, vmax=1, linewidths=.5
)
plt.title("Correlation — Key Numerical Features")
plt.tight_layout()
plt.show()
3. Model Development & Evaluation¶
- Linear Regression
- Decision Tree Regressor
- Random Forest Regressor
3.1 Linear Regression Model¶
In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# --- Drop irrelevant or redundant columns ---
df_model = df.drop(columns=[
"source_file",
"block",
"street_name",
"remaining_lease" # text version, replaced by numeric one
])
# --- Encode categorical columns ---
label_cols = ["town", "flat_type", "flat_model", "storey_range"]
le = LabelEncoder()
for col in label_cols:
df_model[col] = le.fit_transform(df_model[col])
# --- Define X (features) and y (target) ---
X = df_model.drop(columns=["resale_price"])
y = df_model["resale_price"]
# --- Split data into train and test sets (80% train, 20% test) ---
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
print("✅ Data prepared successfully!")
print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")
✅ Data prepared successfully! Training set: (502872, 10) Testing set: (125719, 10)
In [19]:
# Convert datetime columns to numeric (years)
X_train = X_train.copy()
X_test = X_test.copy()
# Extract just the year from date columns
for col in ["month", "lease_commence_date"]:
if col in X_train.columns:
X_train[col] = pd.to_datetime(X_train[col]).dt.year
X_test[col] = pd.to_datetime(X_test[col]).dt.year
# Now fill any missing values caused by conversion
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
print("✅ Date columns converted to numeric successfully!")
✅ Date columns converted to numeric successfully!
In [20]:
# Ensure all datetime columns are converted to year integers
for col in X_train.columns:
if np.issubdtype(X_train[col].dtype, np.datetime64):
X_train[col] = pd.to_datetime(X_train[col]).dt.year.fillna(0).astype(int)
X_test[col] = pd.to_datetime(X_test[col]).dt.year.fillna(0).astype(int)
# Double-check no datetime columns remain
print("Remaining datetime columns:", X_train.select_dtypes(include=['datetime64']).columns.tolist())
Remaining datetime columns: []
In [21]:
# core imports for modeling
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
print("Imports OK")
Imports OK
In [22]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
Out[22]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [23]:
# Check which columns still have datetime types
X_train.select_dtypes(include=['datetime64']).columns
Out[23]:
Index([], dtype='object')
In [24]:
# Check which columns still have datetime types
X_train.select_dtypes(include=['datetime64']).columns
Out[24]:
Index([], dtype='object')
In [25]:
X_train.select_dtypes(include=['datetime64']).columns
Out[25]:
Index([], dtype='object')
In [26]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
Out[26]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [27]:
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Make predictions
y_pred = lin_reg.predict(X_test)
# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"R² Score: {r2:.4f}")
# Scatter plot to visualize actual vs predicted
plt.figure(figsize=(4,4))
plt.scatter(y_test, y_pred, alpha=0.3, color='royalblue')
plt.xlabel("Actual Resale Price")
plt.ylabel("Predicted Resale Price")
plt.title("Actual vs Predicted Prices — Linear Regression")
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.show()
Root Mean Squared Error (RMSE): 83,858.53 R² Score: 0.7375
3.2 Decision Tree Regression Model¶
In [28]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
# --- Train the Decision Tree Model ---
tree_reg = DecisionTreeRegressor(max_depth=10, random_state=42)
tree_reg.fit(X_train, y_train)
# --- Make predictions ---
y_pred_tree = tree_reg.predict(X_test)
# --- Evaluate performance ---
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = mse_tree ** 0.5
r2_tree = r2_score(y_test, y_pred_tree)
print(f"Decision Tree RMSE: {rmse_tree:,.2f}")
print(f"Decision Tree R²: {r2_tree:.4f}")
# --- Feature Importance with Color Palette (fixed version) ---
feature_importance = pd.DataFrame({
"Feature": X_train.columns,
"Importance": tree_reg.feature_importances_
}).sort_values(by="Importance", ascending=False)
plt.figure(figsize=(8,4))
sns.barplot(
x="Importance",
y="Feature",
hue="Feature",
data=feature_importance,
palette="coolwarm",
legend=False
)
plt.title("Feature Importance — Decision Tree (Coolwarm Palette)")
plt.xlabel("Relative Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()
Decision Tree RMSE: 56,853.22 Decision Tree R²: 0.8793
3.3. Random Forest Regression Model¶
In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# --- Train the Random Forest Model ---
rf_reg = RandomForestRegressor(
n_estimators=100, # number of trees
max_depth=15, # limits tree depth to prevent overfitting
random_state=42,
n_jobs=-1 # use all CPU cores
)
rf_reg.fit(X_train, y_train)
# --- Make predictions ---
y_pred_rf = rf_reg.predict(X_test)
# --- Evaluate performance ---
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"🌲 Random Forest RMSE: {rmse_rf:,.2f}")
print(f"🌲 Random Forest R²: {r2_rf:.4f}")
# --- Feature Importance Visualization ---
feature_importance_rf = pd.DataFrame({
"Feature": X_train.columns,
"Importance": rf_reg.feature_importances_
}).sort_values(by="Importance", ascending=False)
plt.figure(figsize=(8,4))
sns.barplot(
x="Importance",
y="Feature",
hue="Feature",
data=feature_importance_rf,
dodge=False,
palette=sns.color_palette("muted", n_colors=len(feature_importance_rf)),
legend=False
)
plt.title("Feature Importance — Random Forest (Muted Palette)")
plt.xlabel("Relative Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()
🌲 Random Forest RMSE: 38,860.97 🌲 Random Forest R²: 0.9436
3.4 Model Comparison¶
In [34]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# --- Gather all model metrics ---
results = pd.DataFrame({
"Model": ["Linear Regression", "Decision Tree", "Random Forest"],
"R² Score": [r2, r2_tree, r2_rf],
"RMSE": [rmse, rmse_tree, rmse_rf]
})
# --- Plot R² comparison (muted palette) ---
plt.figure(figsize=(6,4))
sns.barplot(
x="Model",
y="R² Score",
hue="Model", # ✅ fixes warning
data=results,
palette="muted",
dodge=False,
legend=False
)
plt.title("Model Comparison — R² Score")
plt.ylim(0,1)
plt.tight_layout()
plt.show()
# --- Plot RMSE comparison (pastel palette) ---
plt.figure(figsize=(6,4))
sns.barplot(
x="Model",
y="RMSE",
hue="Model", # ✅ fixes warning
data=results,
palette="pastel",
dodge=False,
legend=False
)
plt.title("Model Comparison — RMSE (Lower is Better)")
plt.tight_layout()
plt.show()
# --- Display summary table ---
display(results.style.background_gradient(cmap="YlGnBu"))
| Model | R² Score | RMSE | |
|---|---|---|---|
| 0 | Linear Regression | 0.737510 | 83858.526719 |
| 1 | Decision Tree | 0.879350 | 56853.222920 |
| 2 | Random Forest | 0.943630 | 38860.968179 |
In [35]:
# --- Annotated R² Comparison Chart (muted palette) ---
plt.figure(figsize=(5,5))
ax = sns.barplot(
x="Model",
y="R² Score",
hue="Model",
data=results,
palette="muted",
dodge=False,
legend=False
)
plt.title("Model Comparison — R² Score")
plt.ylim(0, 1)
# Add text labels on each bar
for p in ax.patches:
ax.text(
p.get_x() + p.get_width() / 2,
p.get_height() + 0.02,
f"{p.get_height():.3f}",
ha="center", va="bottom", fontsize=10
)
plt.tight_layout()
plt.show()
# --- Annotated RMSE Comparison Chart (pastel palette) ---
plt.figure(figsize=(5,6))
ax = sns.barplot(
x="Model",
y="RMSE",
hue="Model",
data=results,
palette="pastel",
dodge=False,
legend=False
)
plt.title("Model Comparison — RMSE (Lower is Better)")
# Add text labels on each bar
for p in ax.patches:
ax.text(
p.get_x() + p.get_width() / 2,
p.get_height() + (0.02 * max(results["RMSE"])), # small offset
f"{p.get_height():,.0f}",
ha="center", va="bottom", fontsize=10
)
plt.tight_layout()
plt.show()
In [31]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Example data (replace with your actual results DataFrame)
results = pd.DataFrame({
"Model": ["Linear Regression", "Decision Tree", "Random Forest"],
"R² Score": [0.737510, 0.879350, 0.943630],
"RMSE": [83858.53, 56853.22, 38860.97]
})
# --- R² Comparison Chart (Muted Palette) ---
plt.figure(figsize=(6,4))
ax = sns.barplot(
x="Model",
y="R² Score",
hue="Model",
data=results,
palette="muted",
dodge=False,
legend=False
)
plt.title("Model Comparison — R² Score", fontsize=13, weight="bold")
plt.ylim(0, 1.05)
plt.ylabel("R² Score")
# Add annotations for exact R² values
for p in ax.patches:
value = p.get_height()
ax.text(
p.get_x() + p.get_width()/2,
value + 0.02, # small offset above bar
f"{value:.3f}",
ha="center", va="bottom",
fontsize=10, color="black", weight="bold"
)
plt.tight_layout()
plt.show()
# --- RMSE Comparison Chart (Pastel Palette) ---
plt.figure(figsize=(6,4))
ax = sns.barplot(
x="Model",
y="RMSE",
hue="Model",
data=results,
palette="pastel",
dodge=False,
legend=False
)
plt.title("Model Comparison — RMSE (Lower is Better)", fontsize=13, weight="bold")
plt.ylabel("RMSE")
# Add annotations for exact RMSE values
for p in ax.patches:
value = p.get_height()
ax.text(
p.get_x() + p.get_width()/2,
value + 1500, # offset based on chart scale
f"{value:,.0f}", # comma format, 0 decimals
ha="center", va="bottom",
fontsize=10, color="black", weight="bold"
)
plt.tight_layout()
plt.show()
In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# --- Extract feature importances from both models ---
tree_importances = pd.Series(tree_reg.feature_importances_, index=X.columns)
rf_importances = pd.Series(rf_reg.feature_importances_, index=X.columns)
# --- Sort top 10 features ---
top_n = 10
tree_top = tree_importances.sort_values(ascending=False).head(top_n)
rf_top = rf_importances.sort_values(ascending=False).head(top_n)
# --- Create side-by-side plots ---
fig, axes = plt.subplots(1, 2, figsize=(15,5))
# Decision Tree
sns.barplot(
x=tree_top.values, y=tree_top.index,
ax=axes[0],
palette="muted",
hue=tree_top.index, legend=False, dodge=False
)
axes[0].set_title("Decision Tree — Top Feature Importances", fontsize=13, weight="bold")
axes[0].set_xlabel("Importance")
axes[0].set_ylabel("Feature")
# Annotate importance values
for i, v in enumerate(tree_top.values):
axes[0].text(v + 0.002, i, f"{v:.3f}", va="center", fontsize=9)
# Random Forest
sns.barplot(
x=rf_top.values, y=rf_top.index,
ax=axes[1],
palette="pastel",
hue=rf_top.index, legend=False, dodge=False
)
axes[1].set_title("Random Forest — Top Feature Importances", fontsize=13, weight="bold")
axes[1].set_xlabel("Importance")
axes[1].set_ylabel("")
for i, v in enumerate(rf_top.values):
axes[1].text(v + 0.002, i, f"{v:.3f}", va="center", fontsize=9)
plt.tight_layout()
plt.show()
In [32]:
import pandas as pd
feature_importance_rf = pd.DataFrame({
"Feature": X_train.columns,
"Importance": rf_reg.feature_importances_
}).sort_values("Importance", ascending=False).head(10)
plt.figure(figsize=(8,4))
sns.barplot(
x="Importance", y="Feature",
hue="Feature", data=feature_importance_rf,
palette="pastel", dodge=False, legend=False
)
plt.title("Random Forest — Top Feature Importances")
plt.tight_layout()
plt.show()
In [ ]: